Applying our zero-inflated model to the olfactory data.
We select only the cells that pass QC, color coded by the experimental time point. Note that the data are not public, hence it should not work other than on Davide’s computer.
I think Kelly said the data are now publicly available. How do we get the publicly available dataset?
load("../data/Expt4c2b_filtdata.Rda")
load("../data/E4c2b_slingshot_wsforkelly.RData")
names(batch) <- colnames(counts)
counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]
vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]
We have 616 cells. To speed up the computations, we will focus on the top 1,000 most variable genes.
dim(core)
## [1] 1000 616
core[1:3, 1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 8763 7221 3581
## Cbr2 5799 3638 1448
## Cyp2f2 2158 2027 1078
Cells have been processed in 18 different batches
table(batch)
## batch
## GBC08A GBC08B GBC09A GBC09B P01 P02 P03A P03B P04 P05
## 36 33 30 17 25 43 37 33 14 20
## P06 P10 P11 P12 P13 P14 Y01 Y04
## 39 36 44 40 47 39 51 32
Cells have been clustered into 13 different clusters
table(clus.labels)
## clus.labels
## 1 2 3 4 5 7 8 9 10 11 12 14 15
## 91 25 56 40 96 60 28 79 26 22 35 26 32
Batches are kind of confounded with the biology
table(data.frame(batch = as.vector(batch),
cluster = clus.labels))
## cluster
## batch 1 2 3 4 5 7 8 9 10 11 12 14 15
## GBC08A 0 2 12 9 0 0 0 0 0 2 0 2 9
## GBC08B 0 7 5 3 0 0 0 1 2 4 0 5 6
## GBC09A 0 1 5 9 0 0 0 1 1 0 0 6 7
## GBC09B 0 2 2 7 0 0 0 3 0 0 0 3 0
## P01 0 2 4 3 15 1 0 0 0 0 0 0 0
## P02 2 0 9 3 15 3 3 2 3 0 2 1 0
## P03A 3 0 3 0 12 2 9 4 2 0 2 0 0
## P03B 1 2 1 1 11 1 2 10 1 1 2 0 0
## P04 0 0 0 0 11 1 0 1 1 0 0 0 0
## P05 0 0 0 1 11 3 0 1 0 2 2 0 0
## P06 1 2 3 0 8 2 4 8 4 1 2 2 2
## P10 3 1 4 0 4 5 9 2 0 2 5 0 1
## P11 2 1 1 0 1 5 1 22 3 1 6 0 1
## P12 0 2 0 0 4 10 0 8 2 3 6 4 1
## P13 1 2 4 0 4 15 0 4 5 6 1 3 2
## P14 0 0 1 2 0 12 0 12 2 0 7 0 3
## Y01 47 1 1 2 0 0 0 0 0 0 0 0 0
## Y04 31 0 1 0 0 0 0 0 0 0 0 0 0
Could you remind me where the clusters come from?
For each cell, we dispose of qc measures
head(qc, 2)
## NREADS NALIGNED RALIGN TOTAL_DUP PRIMER
## OEP01_N706_S501 3313260 3167600 95.6035 47.9943 0.0154566
## OEP01_N701_S501 2902430 2757790 95.0167 45.0150 0.0182066
## PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES
## OEP01_N706_S501 2e-06 0.200130 0.230654
## OEP01_N701_S501 0e+00 0.182461 0.201810
## PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES
## OEP01_N706_S501 0.404205 0.165009 0.430784
## OEP01_N701_S501 0.465702 0.150027 0.384271
## MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS
## OEP01_N706_S501 0.843857 0.061028 0.521079
## OEP01_N701_S501 0.914370 0.033350 0.373993
First, let’s look at PCA (of the log counts) for reference.
par(mfrow=c(1, 2))
detection_rate <- colSums(core>0)
coverage <- colSums(core)
colMerged <- cc_rev[clus.labels]
pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=19, main="PCA of log-counts, centered not scaled")
fq <- EDASeq::betweenLaneNormalization(counts, which="full")
pcafq <- prcomp(t(log1p(fq)))
plot(pcafq$x, col=colMerged, pch=19, main="PCA of FQ log-counts, centered not scaled")
The first plot is the unnormalized data and the second plot is after full-quantile normalization, which is what Russell and Diya used for the paper.
Do we have a reference for this paper?
Is it a fair comparison?? for pca on log counts, we use the 1000 most variable genes, but for full quantile normalization, we use all the genes. I understand that to perform full quantile normalization we need all the genes, but then should we look at only the 1000 most variable genes defined above, no?
They found that to fully explain the differences between clusters, we need three dimensions.
pairs(pcafq$x[,1:3], col=colMerged, pch=19,
main="PCA of FQ log-counts, centered not scaled")
df <- data.frame(PC1 = pcafq$x[, 1],
PC2 = pcafq$x[, 2],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch = 19, col = colMerged)
print(cor(df, method = "spearman"))
## PC1 PC2 detection_rate coverage
## PC1 1.00000000 -0.0432585 0.2643194 -0.02214266
## PC2 -0.04325850 1.0000000 0.6508453 0.19979644
## detection_rate 0.26431940 0.6508453 1.0000000 0.41994811
## coverage -0.02214266 0.1997964 0.4199481 1.00000000
Even after full-quantile normalization, there is some residual correlation between PC2 and detection rate. Note that, as expected, detection rate and coverage are correlated.
totalcount = function (ei) {
sums = colSums(ei)
eo = t(t(ei)*mean(sums)/sums)
return(eo)
}
raw <- as.matrix(counts)
tc <- totalcount(raw)
fq <- FQT_FN(raw)
tmm <- TMM_FN(raw)
fastpca <- function(expr, scale=FALSE) {
svd_raw <- svds(scale(t(expr), center=TRUE, scale=scale), k=3, nu=3, nv=0)
pc_raw <- svd_raw$u %*% diag(svd_raw$d[1:3])
return(pc_raw)
}
vargenes <- rownames(core)
pc_raw <- fastpca(log1p(raw[vargenes,]))
pc_tc <- fastpca(log1p(tc[vargenes,]))
pc_fq <- fastpca(log1p(fq[vargenes,]))
pc_tmm <- fastpca(log1p(tmm[vargenes,]))
Let’s now look at zifa.
wrapRzifa <- function(Y, block = TRUE, k=2){
# wrapper R function for ZIFA.
# md5 hashing and temporary files are used not to re-run zifa
# if it has already be run on this computer.
d = digest(Y, "md5")
tmp = paste0(tempdir(), '/', d)
write.csv(Y, paste0(tmp, '.csv'))
if (!file.exists(paste0(tmp, "_", k, '_zifa.csv'))){
print('run ZIFA')
bb = ifelse(block, '-b ', '')
cmd = sprintf('python run_zifa.py -d %d %s%s.csv %s_%d_zifa.csv', k, bb, tmp, tmp, k)
system(cmd)
}
read.csv(sprintf("%s_%d_zifa.csv", tmp, k), header=FALSE)
}
zifa_raw <- wrapRzifa(log1p(raw[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_tc <- wrapRzifa(log1p(tc[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_tmm <- wrapRzifa(log1p(tmm[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_fq <- wrapRzifa(log1p(fq[vargenes,]), k=3)
## [1] "run ZIFA"
pairs(zifa_raw[,1:3], col=colMerged, pch=19,
main="ZIFA of raw log-counts")
pairs(zifa_fq[,1:3], col=colMerged, pch=19,
main="ZIFA of FQ log-counts")
We are going to look at different parameters for ZINB
Using only intercepts in X and V.
print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3)))
## user system elapsed
## 220.663 43.479 111.175
pairs(zinb_3@W, col = colMerged, pch = 19, main = "ZINB")
qcpca <- prcomp(qc, center = TRUE, scale. = TRUE)
df <- data.frame(W1 = zinb_3@W[,1],
W2 = zinb_3@W[,2],
W3 = zinb_3@W[,3],
gamma_mu = zinb_3@gamma_mu[1,],
gamma_pi = zinb_3@gamma_pi[1,],
detection_rate=detection_rate,
coverage = coverage,
quality = qcpca$x[,1])
pairs(df, pch = 19, col = colMerged)
print(cor(df, method = "spearman"))
## W1 W2 W3 gamma_mu gamma_pi
## W1 1.00000000 -0.02414526 -0.09064496 0.08205038 0.15669401
## W2 -0.02414526 1.00000000 -0.01166085 -0.03726806 0.20835852
## W3 -0.09064496 -0.01166085 1.00000000 -0.13109039 -0.01740836
## gamma_mu 0.08205038 -0.03726806 -0.13109039 1.00000000 -0.07958898
## gamma_pi 0.15669401 0.20835852 -0.01740836 -0.07958898 1.00000000
## detection_rate -0.15316526 -0.32159255 0.01930138 0.20659231 -0.97309429
## coverage 0.07916631 0.01772106 0.04078241 0.89501409 -0.32085525
## quality 0.05742306 0.11847496 -0.05564158 -0.55196048 0.30254576
## detection_rate coverage quality
## W1 -0.15316526 0.07916631 0.05742306
## W2 -0.32159255 0.01772106 0.11847496
## W3 0.01930138 0.04078241 -0.05564158
## gamma_mu 0.20659231 0.89501409 -0.55196048
## gamma_pi -0.97309429 -0.32085525 0.30254576
## detection_rate 1.00000000 0.41994811 -0.34747756
## coverage 0.41994811 1.00000000 -0.60944735
## quality -0.34747756 -0.60944735 1.00000000
print(system.time(zinb_5 <- zinbFit(core, ncores = 3, K = 5)))
## user system elapsed
## 274.465 42.470 131.200
pairs(zinb_5@W, col = colMerged, pch = 19, main = "ZINB")
print(system.time(zinb_10 <- zinbFit(core, ncores = 3, K = 10)))
## user system elapsed
## 482.195 59.792 212.380
pairs(zinb_10@W, col = colMerged, pch = 19, main = "ZINB")
X has an intercept and 5 additional columns with the first five PC on qc measures.
mod <- model.matrix(~ qcpca$x[, 1:5])
print(system.time(zinb_3_qc <- zinbFit(core, ncores = 3, K = 3, X = mod)))
## user system elapsed
## 363.469 46.409 159.956
pairs(zinb_3_qc@W, col=colMerged, pch=19, main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1 = zinb_3_qc@W[,1],
W2 = zinb_3_qc@W[,2],
W3 = zinb_3_qc@W[,3],
gamma_mu = zinb_3_qc@gamma_mu[1,],
gamma_pi = zinb_3_qc@gamma_pi[1,],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch = 19, col = colMerged)
print(cor(df, method = "spearman"))
## W1 W2 W3 gamma_mu
## W1 1.00000000 -0.03466116 -0.021067844 0.154529488
## W2 -0.03466116 1.00000000 -0.021384549 -0.064591459
## W3 -0.02106784 -0.02138455 1.000000000 0.006667748
## gamma_mu 0.15452949 -0.06459146 0.006667748 1.000000000
## gamma_pi -0.02937107 -0.04003005 0.127357641 -0.281894727
## detection_rate 0.06250448 0.12131856 -0.160817375 0.466624520
## coverage 0.02182683 -0.10622722 -0.098061523 0.868247711
## gamma_pi detection_rate coverage
## W1 -0.02937107 0.06250448 0.02182683
## W2 -0.04003005 0.12131856 -0.10622722
## W3 0.12735764 -0.16081737 -0.09806152
## gamma_mu -0.28189473 0.46662452 0.86824771
## gamma_pi 1.00000000 -0.95438636 -0.26924454
## detection_rate -0.95438636 1.00000000 0.41994811
## coverage -0.26924454 0.41994811 1.00000000
print(system.time(zinb_5_qc <- zinbFit(core, ncores = 3, K = 5, X = mod)))
## user system elapsed
## 418.028 49.259 192.562
pairs(zinb_5_qc@W, col=colMerged, pch=19, main="ZINB")
print(system.time(zinb_10_qc <- zinbFit(core, ncores = 3, K = 10, X = mod)))
## user system elapsed
## 644.981 67.262 278.458
pairs(zinb_10_qc@W, col=colMerged, pch=19, main="ZINB")
X has an intercept and columns for the batchs.
class(batch)
## [1] "factor"
mod <- model.matrix(~ batch)
dim(mod)
## [1] 616 18
print(system.time(zinb_3_batch <- zinbFit(core, ncores = 3, K = 3, X = mod)))
## user system elapsed
## 420.864 49.121 178.302
pairs(zinb_3_batch@W, col=colMerged, pch=19, main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1 = zinb_3@W[,1],
W2 = zinb_3@W[,2],
W3 = zinb_3@W[,3],
gamma_mu = zinb_3@gamma_mu[1,],
gamma_pi = zinb_3@gamma_pi[1,],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch = 19, col = colMerged)
print(cor(df, method = "spearman"))
## W1 W2 W3 gamma_mu gamma_pi
## W1 1.00000000 -0.02414526 -0.09064496 0.08205038 0.15669401
## W2 -0.02414526 1.00000000 -0.01166085 -0.03726806 0.20835852
## W3 -0.09064496 -0.01166085 1.00000000 -0.13109039 -0.01740836
## gamma_mu 0.08205038 -0.03726806 -0.13109039 1.00000000 -0.07958898
## gamma_pi 0.15669401 0.20835852 -0.01740836 -0.07958898 1.00000000
## detection_rate -0.15316526 -0.32159255 0.01930138 0.20659231 -0.97309429
## coverage 0.07916631 0.01772106 0.04078241 0.89501409 -0.32085525
## detection_rate coverage
## W1 -0.15316526 0.07916631
## W2 -0.32159255 0.01772106
## W3 0.01930138 0.04078241
## gamma_mu 0.20659231 0.89501409
## gamma_pi -0.97309429 -0.32085525
## detection_rate 1.00000000 0.41994811
## coverage 0.41994811 1.00000000
print(system.time(zinb_5_batch <- zinbFit(core, ncores = 3, K = 5, X = mod)))
## user system elapsed
## 501.115 60.472 237.849
pairs(zinb_5_batch@W, col=colMerged, pch=19, main="ZINB")
print(system.time(zinb_10_batch <- zinbFit(core, ncores = 3, K = 10, X = mod)))
## user system elapsed
## 764.353 123.040 381.030
pairs(zinb_10_batch@W, col=colMerged, pch=19, main="ZINB")
X has an intercept, columns for the batches, and columns with the first five PC on QC measures.
mod <- model.matrix(~ qcpca$x[, 1:5] + batch)
print(system.time(zinb_3_qc_batch <- zinbFit(core, ncores = 3, K = 3, X = mod)))
## user system elapsed
## 593.302 83.211 245.604
pairs(zinb_3_qc_batch@W, col=colMerged, pch=19, main="ZINB")
print(system.time(zinb_5_qc_batch <- zinbFit(core, ncores = 3, K = 5, X = mod)))
## user system elapsed
## 646.450 107.370 293.291
pairs(zinb_5_qc_batch@W, col=colMerged, pch=19, main="ZINB")
print(system.time(zinb_10_qc_batch <- zinbFit(core, ncores = 3, K = 10, X = mod)))
## user system elapsed
## 939.106 148.088 412.685
pairs(zinb_10_qc_batch@W, col=colMerged, pch=19, main="ZINB")
save(zinb_3, zinb_5, zinb_10,
zinb_3_qc, zinb_5_qc, zinb_10_qc,
zinb_3_batch, zinb_5_batch, zinb_10_batch,
zinb_3_qc_batch, zinb_5_qc_batch, zinb_10_qc_batch,
pc_tmm, pc_fq, pc_tc, pc_raw,
zifa_fq, zifa_tmm, zifa_tc, zifa_raw,
file="olfactory.rda")